## Pelc 2011 WP

library(foreign) 
library(Amelia)

## Load original dataset
p2011wp <- read.dta("P2011 WP Rep Data.dta")
head(p2011wp)
dim(p2011wp)
summary(p2011wp)

## Drop ID and other index vars
p2011wp$reporternamepolity <- p2011wp$countrycode <- p2011wp$countrycode <- p2011wp$polconcountrycode <- p2011wp$product <- p2011wp$twodigit <- p2011wp$WTOyr <- p2011wp$year <- NULL

## Drop derived dummy vars
p2011wp$bnd2005 <- p2011wp$chinadummy <- p2011wp$xchangedum1 <-p2011wp$xchangedum2 <-p2011wp$xchangedum3 <-p2011wp$xchangedum4 <-p2011wp$xchangedum5 <-p2011wp$xchangedum6 <-p2011wp$xchangedum945 <-p2011wp$xchangedum946 <-p2011wp$pre95ADdummy <- p2011wp$USdum <- p2011wp$Canadadum <- p2011wp$AUSdum <- p2011wp$countrydum1 <-p2011wp$countrydum2 <-p2011wp$countrydum3 <-p2011wp$countrydum4 <- NULL

## Drop constituent items
p2011wp$agri_ahs <- p2011wp$agri_bnd <- p2011wp$agri_mfn <- NULL

## Drop already imputed variables
p2011wp$pre95ADfilled <- NULL

## Drop calculated parameters
p2011wp$b1 <-p2011wp$b2 <-p2011wp$b3 <-p2011wp$b4 <-p2011wp$b5 <-p2011wp$b6 <-p2011wp$b7 <-p2011wp$b8 <-p2011wp$b9 <-p2011wp$b10 <- p2011wp$meanpre95ahs <- p2011wp$bnd3 <- p2011wp$bnd2005 <- p2011wp$overhang2005 <- p2011wp$mfn06 <- p2011wp$mfn2005 <- NULL

which( colnames(p2011wp)=="_merge_polity" )
which( colnames(p2011wp)=="_mergeDPI" )
which( colnames(p2011wp)=="_est_t2_1" )
which( colnames(p2011wp)=="_est_t2_2" )
which( colnames(p2011wp)=="_est_t2_3" )
which( colnames(p2011wp)=="_est_t1_1" )
which( colnames(p2011wp)=="_est_t1_2" )
which( colnames(p2011wp)=="_est_t1_3" )
which( colnames(p2011wp)=="_merge" )

p2011wp <- p2011wp[, -c(13, 34, 51, 52, 53, 54, 55, 56, 69)]

## How many variables? 70: reduction necessary due to large sample size
dim(p2011wp)

## Which variables are in analysis and have missing data?
sum(is.na(p2011wp $reporter))/nrow(p2011wp)*100 ## None
sum(is.na(p2011wp $overhang1))/nrow(p2011wp)*100
sum(is.na(p2011wp $mfn))/nrow(p2011wp)*100
sum(is.na(p2011wp $xchangedum944))/nrow(p2011wp)*100
sum(is.na(p2011wp $logGDPcap))/nrow(p2011wp)*100
sum(is.na(p2011wp $logGDPconst))/nrow(p2011wp)*100
sum(is.na(p2011wp $xpolity))/nrow(p2011wp)*100
sum(is.na(p2011wp $logimportsprod))/nrow(p2011wp)*100
sum(is.na(p2011wp $ldc))/nrow(p2011wp)*100 ## None
sum(is.na(p2011wp $agri))/nrow(p2011wp)*100 ## None
sum(is.na(p2011wp $recententrant))/nrow(p2011wp)*100 ## None
sum(is.na(p2011wp $xchangedum941))/nrow(p2011wp)*100
sum(is.na(p2011wp $xchangedum942))/nrow(p2011wp)*100
sum(is.na(p2011wp $xchangedum943))/nrow(p2011wp)*100
sum(is.na(p2011wp $xchange1994))/nrow(p2011wp)*100
sum(is.na(p2011wp $ADuserconsist95))/nrow(p2011wp)*100
sum(is.na(p2011wp $pre95AD))/nrow(p2011wp)*100
sum(is.na(p2011wp $logfullimports))/nrow(p2011wp)*100
sum(is.na(p2011wp $meanpre95mfn))/nrow(p2011wp)*100
sum(is.na(p2011wp $mfn))/nrow(p2011wp)*100
sum(is.na(p2011wp $reportername))/nrow(p2011wp)*100 ## None
sum(is.na(p2011wp $bnd))/nrow(p2011wp)*100

analysis <- as.data.frame(cbind(p2011wp$overhang1, p2011wp$mfn, p2011wp$xchangedum944, p2011wp$logGDPcap, p2011wp$logGDPconst, p2011wp$xpolity,p2011wp$logimportsprod,p2011wp$xchangedum941,p2011wp$xchangedum942,p2011wp$xchangedum943,p2011wp$xchange1994,p2011wp$ADuserconsist95,p2011wp$pre95AD,p2011wp$logfullimports,p2011wp$meanpre95mfn,p2011wp$bnd))

missing <- as.data.frame(cbind(as.integer(complete.cases(p2011wp$overhang1)), as.integer(complete.cases(p2011wp$mfn)), as.integer(complete.cases(p2011wp$xchangedum944)), as.integer(complete.cases(p2011wp$logGDPcap)), as.integer(complete.cases(p2011wp$logGDPconst)), as.integer(complete.cases(p2011wp$xpolity)), as.integer(complete.cases(p2011wp$logimportsprod)), as.integer(complete.cases(p2011wp$xchangedum941)), as.integer(complete.cases(p2011wp$xchangedum942)), as.integer(complete.cases(p2011wp$xchangedum943)), as.integer(complete.cases(p2011wp$xchange1994)), as.integer(complete.cases(p2011wp$ADuserconsist95)), as.integer(complete.cases(p2011wp$pre95AD)), as.integer(complete.cases(p2011wp$logfullimports)), as.integer(complete.cases(p2011wp$meanpre95mfn)), as.integer(complete.cases(p2011wp$bnd))))

dim(analysis)
dim(missing)
apply(missing, 2, sd)

## Remove analysis variables
## p2011wp$overhang1 <- p2011wp$mfn <- p2011wp$xchangedum944 <- p2011wp$logGDPcap <- p2011wp$logGDPconst <- p2011wp$xpolity <- p2011wp$logimportsprod <-p2011wp$xchangedum941 <- p2011wp$xchangedum942 <- p2011wp$xchangedum943 <- p2011wp$xchange1994 <- p2011wp$ADuserconsist95 <- p2011wp$pre95AD <- p2011wp$logfullimports <- p2011wp$meanpre95mfn <- p2011wp$bnd <- NULL
## head(p2011wp)
## var(p2011wp)
 
 ## Check round(correlations and missing values
round(cor(p2011wp$prf, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$prf, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$prf))/nrow(p2011wp)*100
## N
p2011wp$prf <- NULL

round(cor(p2011wp$importsvalue000, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$importsvalue000, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$importsvalue000))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$ahs, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$ahs, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$ahs))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$overhang2, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$overhang2, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$overhang2))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$importsvalue000, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$importsvalue000, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$importsvalue000))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$xconst, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$xconst, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$xconst))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$Xchange, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$Xchange, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$Xchange))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$exportsGDPperc, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$exportsGDPperc, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$exportsGDPperc))/nrow(p2011wp)*100
## N
p2011wp$exportsGDPperc <- NULL

round(cor(p2011wp$fdi, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$fdi, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$fdi))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$GDPconst, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$GDPconst, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$GDPconst))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$GDPcap, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$GDPcap, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$GDPcap))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$importsGDPperc, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$importsGDPperc, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$importsGDPperc))/nrow(p2011wp)*100
## N
p2011wp$importsGDPperc <- NULL

round(cor(p2011wp$yrcurnt, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$yrcurnt, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$yrcurnt))/nrow(p2011wp)*100
## N
p2011wp$yrcurnt <- NULL

round(cor(p2011wp$allhouse, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$allhouse, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$allhouse))/nrow(p2011wp)*100
## N
p2011wp$allhouse <- NULL

round(cor(p2011wp$eiec, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$eiec, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$eiec))/nrow(p2011wp)*100
## N
p2011wp$eiec <- NULL

round(cor(p2011wp$checks, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$checks, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$checks))/nrow(p2011wp)*100
## N
p2011wp$checks <- NULL

round(cor(p2011wp$stabs, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$stabs, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$stabs))/nrow(p2011wp)*100
## N
p2011wp$stabs <- NULL

round(cor(p2011wp$polariz, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$polariz, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$polariz))/nrow(p2011wp)*100
## N
p2011wp$polariz <- NULL

round(cor(p2011wp$polconiii, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$polconiii, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$polconiii))/nrow(p2011wp)*100
## N
p2011wp$polconiii <- NULL

round(cor(p2011wp$polconv, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$polconv, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$polconv))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$l1, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$l1, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$l1))/nrow(p2011wp)*100
## N
p2011wp$l1 <- NULL

round(cor(p2011wp$l2, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$l2, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$l2))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$xconstfrompolity, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$xconstfrompolity, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$xconstfrompolity))/nrow(p2011wp)*100
## N
p2011wp$xconstfrompolity <- NULL

round(cor(p2011wp$polityfrompolity, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$polityfrompolity, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$polityfrompolity))/nrow(p2011wp)*100
## N
p2011wp$polityfrompolity <- NULL

round(cor(p2011wp$logfdi, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$logfdi, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$logfdi))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$deving, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$deving, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$deving))/nrow(p2011wp)*100
## N
p2011wp$deving <- NULL

round(cor(p2011wp$ldc, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$ldc, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$ldc))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$prefmargin, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$prefmargin, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$prefmargin))/nrow(p2011wp)*100
## N
p2011wp$prefmargin <- NULL

round(cor(p2011wp$developed, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$developed, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$developed))/nrow(p2011wp)*100
## N
p2011wp$developed <- NULL

round(cor(p2011wp$fullimports, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$fullimports, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$fullimports))/nrow(p2011wp)*100
## N
p2011wp$fullimports <- NULL

round(cor(p2011wp$ptaB25, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$ptaB25, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$ptaB25))/nrow(p2011wp)*100
## N
p2011wp$ptaB25 <- NULL

round(cor(p2011wp$agri_overhang1, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$agri_overhang1, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$agri_overhang1))/nrow(p2011wp)*100
## N
p2011wp$agri_overhang1 <- NULL

round(cor(p2011wp$agri_overhang2, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$agri_overhang2, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$agri_overhang2))/nrow(p2011wp)*100
## N
p2011wp$agri_overhang2 <- NULL

round(cor(p2011wp$agri4country, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$agri4country, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$agri4country))/nrow(p2011wp)*100
## N
p2011wp$agri4country <- NULL

round(cor(p2011wp$adlaw94, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$adlaw94, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$adlaw94))/nrow(p2011wp)*100
## N
p2011wp$adlaw94 <- NULL

round(cor(p2011wp$adlaw95, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$adlaw95, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$adlaw95))/nrow(p2011wp)*100
## N
p2011wp$adlaw95 <- NULL

round(cor(p2011wp$adlaw96, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$adlaw96, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$adlaw96))/nrow(p2011wp)*100
## N
p2011wp$adlaw96 <- NULL

round(cor(p2011wp$adlaw97, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$adlaw97, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$adlaw97))/nrow(p2011wp)*100
## N
p2011wp$adlaw97 <- NULL

round(cor(p2011wp$aduserconsist94, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$aduserconsist94, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$aduserconsist94))/nrow(p2011wp)*100
## N
p2011wp$aduserconsist94 <- NULL

round(cor(p2011wp$aduserconsist95, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$aduserconsist95, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$aduserconsist95))/nrow(p2011wp)*100
## N
p2011wp$aduserconsist95 <- NULL

round(cor(p2011wp$aduserconsist96, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$aduserconsist96, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$aduserconsist96))/nrow(p2011wp)*100
## N
p2011wp$aduserconsist96 <- NULL

round(cor(p2011wp$aduserconsist97, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$aduserconsist97, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$aduserconsist97))/nrow(p2011wp)*100
## N
p2011wp$aduserconsist97 <- NULL

round(cor(p2011wp$ADuserconsist94, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$ADuserconsist94, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$ADuserconsist94))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$ADuserconsist96, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$ADuserconsist96, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$ADuserconsist96))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$ADuserconsist97, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$ADuserconsist97, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$ADuserconsist97))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$ADlaw97, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$ADlaw97, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$ADlaw97))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$ADlaw96, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$ADlaw96, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$ADlaw96))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$ADlaw95, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$ADlaw95, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$ADlaw95))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$ADlaw94, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$ADlaw94, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$ADlaw94))/nrow(p2011wp)*100
## Y

round(cor(p2011wp$xchangeXAD, analysis, use = "pairwise.complete.obs"), 2)
round(cor(p2011wp$xchangeXAD, missing, use = "pairwise.complete.obs"), 2)
sum(is.na(p2011wp$xchangeXAD))/nrow(p2011wp)*100
## Y

## Imputation
case <-c(1:nrow(p2011wp))
p2011wp <- cbind(p2011wp, case)
head(p2011wp)

## What is average percentage of missing data?
NAs <- function(x) {
    as.vector(apply(x, 2, function(x) length(which(is.na(x)))))
    }
NAs(p2011wp)
mean(NAs(p2011wp)/nrow(p2011wp))*100

## Thus: 14 imputations

set.seed(02138)
p2011wp.out <- amelia(p2011wp, m = 14, idvars = "reportername", cs = "reporter", empri = 0.01*nrow(p2011wp))

write.amelia(obj=p2011wp.out, file.stem = "P2011 WP Imp Data", format = "dta", separate = FALSE)

